# -*- coding: utf-8 -*-
import json
import jieba
import pickle
from gensim import corpora, models, similarities
from os.path import exists
from warnings import filterwarnings
filterwarnings('ignore')  # 不打印警告

# Sample corpus: three short Chinese sentences about Yulong Snow Mountain.
wordslist = ["我在玉龙雪山我我我我",
             "我喜欢玉龙雪山",
             "我还要去玉龙雪山"] 
# Segment every sentence into a token list with jieba.
textTest = [list(jieba.cut(words)) for words in wordslist]
# 生成字典
# Build the token -> integer-id mapping from the segmented documents.
dictionary = corpora.Dictionary(textTest)


# FIX: `iterkeys()` was the Python-2-era API and no longer exists on
# gensim's Dictionary under Python 3 / gensim 4.x; `keys()` is portable.
for key in dictionary.keys():
    # token id, token text, and the token's document frequency
    print(key, dictionary.get(key), dictionary.dfs[key])

# 0 在 1
# 1 我 3
# 2 玉龙雪山 3
# 3 喜欢 1
# 4 去 1
# 5 还要 1

print("textTest = %s" % textTest)

# [
#  ['我', '在', '玉龙雪山', '我', '我', '我', '我'], 
#  ['我', '喜欢', '玉龙雪山'], 
#  ['我', '还要', '去', '玉龙雪山']
# ]

# Convert each tokenized document into a sparse bag-of-words vector,
# i.e. a list of (token_id, count) pairs.  Note: `corpus` is a plain
# list, materialized eagerly.
corpus = list(map(dictionary.doc2bow, textTest))
print(corpus)
# [
#  [(0, 1), (1, 5), (2, 1)],
#  [(1, 1), (2, 1), (3, 1)],
#  [(1, 1), (2, 1), (4, 1), (5, 1)]
# ]

# Train a TF-IDF model on the bag-of-words corpus (collects per-term
# document frequencies to compute IDF weights).
tfidf = models.TfidfModel(corpus)
print(tfidf)
# TfidfModel(num_docs=3, num_nnz=10)

# Applying the model re-weights every document vector by TF-IDF.
corpus_tfidf = tfidf[corpus]

# Inspect the weighted vectors.
for weighted_doc in corpus_tfidf:
    print(weighted_doc)
    
# The vocabulary size (token2id entries) is the number of features.
print(dictionary.token2id.keys())
# dict_keys(['在', '我', '玉龙雪山', '喜欢', '去', '还要'])
num = len(dictionary.token2id)
print("特征数: %s" % (num))
# 特征数: 6

# Build a sparse-matrix similarity index over the TF-IDF-weighted corpus.
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=num)


# 相似度测试
test_word = "我喜欢玉龙雪山"
cut_word = [word for word in jieba.cut(test_word)]
new_vec = dictionary.doc2bow(cut_word)
print(new_vec)
# [(1, 1), (2, 1), (3, 1)]
sim = index[tfidf[new_vec]]
print(sim)
